<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>beyondml.pt.layers.MaskedTransformerDecoderLayer &mdash; BeyondML  documentation</title>
      <link rel="stylesheet" href="../../../../_static/pygments.css" type="text/css" />
      <link rel="stylesheet" href="../../../../_static/css/theme.css" type="text/css" />
  <!--[if lt IE 9]>
    <script src="../../../../_static/js/html5shiv.min.js"></script>
  <![endif]-->
  
        <script src="../../../../_static/jquery.js"></script>
        <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js"></script>
        <script data-url_root="../../../../" id="documentation_options" src="../../../../_static/documentation_options.js"></script>
        <script src="../../../../_static/doctools.js"></script>
        <script src="../../../../_static/sphinx_highlight.js"></script>
    <script src="../../../../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../../../../genindex.html" />
    <link rel="search" title="Search" href="../../../../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../../../../index.html" class="icon icon-home">
            BeyondML
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Documentation:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">beyondml</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../../../index.html">BeyondML</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
      <li class="breadcrumb-item active">beyondml.pt.layers.MaskedTransformerDecoderLayer</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>Source code for beyondml.pt.layers.MaskedTransformerDecoderLayer</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">torch</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Union</span><span class="p">,</span> <span class="n">Callable</span>
<span class="kn">from</span> <span class="nn">torch.nn</span> <span class="kn">import</span> <span class="n">functional</span> <span class="k">as</span> <span class="n">F</span>
<span class="kn">from</span> <span class="nn">torch</span> <span class="kn">import</span> <span class="n">Tensor</span>
<span class="kn">from</span> <span class="nn">torch.nn</span> <span class="kn">import</span> <span class="n">Dropout</span><span class="p">,</span> <span class="n">LayerNorm</span>
<span class="kn">from</span> <span class="nn">beyondml.pt.layers</span> <span class="kn">import</span> <span class="n">MaskedDense</span>
<span class="kn">from</span> <span class="nn">.MaskedMultiHeadAttention</span> <span class="kn">import</span> <span class="n">MaskedMultiHeadAttention</span>


<div class="viewcode-block" id="MaskedTransformerDecoderLayer"><a class="viewcode-back" href="../../../../beyondml.pt.layers.html#beyondml.pt.layers.MaskedTransformerDecoderLayer.MaskedTransformerDecoderLayer">[docs]</a><span class="k">class</span> <span class="nc">MaskedTransformerDecoderLayer</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
<span class="w">    </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.</span>
<span class="sd">    This standard decoder layer is based on the paper &quot;Attention Is All You Need&quot;.</span>
<span class="sd">    Args:</span>
<span class="sd">        d_model: the number of expected features in the input (required).</span>
<span class="sd">        nhead: the number of heads in the multiheadattention models (required).</span>
<span class="sd">        dim_feedforward: the dimension of the feedforward network model (default=2048).</span>
<span class="sd">        dropout: the dropout value (default=0.1).</span>
<span class="sd">        activation: the activation function of the intermediate layer, can be a string</span>
<span class="sd">            (&quot;relu&quot; or &quot;gelu&quot;) or a unary callable. Default: relu</span>
<span class="sd">        layer_norm_eps: the eps value in layer normalization components (default=1e-5).</span>
<span class="sd">        batch_first: If ``True``, then the input and output tensors are provided</span>
<span class="sd">            as (batch, seq, feature). Default: ``False`` (seq, batch, feature).</span>
<span class="sd">        norm_first: if ``True``, layer norm is done prior to self attention, multihead</span>
<span class="sd">            attention and feedforward operations, respectively. Otherwise it&#39;s done after.</span>
<span class="sd">            Default: ``False`` (after).</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="n">__constants__</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;batch_first&#39;</span><span class="p">,</span> <span class="s1">&#39;norm_first&#39;</span><span class="p">]</span>

    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
                 <span class="n">d_model</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
                 <span class="n">nhead</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
                 <span class="n">dim_feedforward</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2048</span><span class="p">,</span>
                 <span class="n">dropout</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.1</span><span class="p">,</span>
                 <span class="n">activation</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Tensor</span><span class="p">],</span> <span class="n">Tensor</span><span class="p">]]</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">relu</span><span class="p">,</span>
                 <span class="n">layer_norm_eps</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1e-5</span><span class="p">,</span> <span class="n">batch_first</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
                 <span class="n">norm_first</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
                 <span class="n">device</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
                 <span class="n">dtype</span><span class="o">=</span><span class="kc">None</span>
                 <span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>

        <span class="n">factory_kwargs</span> <span class="o">=</span> <span class="p">{</span><span class="s1">&#39;device&#39;</span><span class="p">:</span> <span class="n">device</span><span class="p">,</span> <span class="s1">&#39;dtype&#39;</span><span class="p">:</span> <span class="n">dtype</span><span class="p">}</span>

        <span class="nb">super</span><span class="p">(</span><span class="n">MaskedTransformerDecoderLayer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>

        <span class="bp">self</span><span class="o">.</span><span class="n">activation</span> <span class="o">=</span> <span class="n">activation</span>

        <span class="bp">self</span><span class="o">.</span><span class="n">self_attn</span> <span class="o">=</span> <span class="n">MaskedMultiHeadAttention</span><span class="p">(</span>
            <span class="n">d_model</span><span class="p">,</span>
            <span class="n">nhead</span><span class="p">,</span>
            <span class="n">dropout</span><span class="o">=</span><span class="n">dropout</span><span class="p">,</span>
            <span class="n">batch_first</span><span class="o">=</span><span class="n">batch_first</span><span class="p">,</span>
            <span class="o">**</span><span class="n">factory_kwargs</span>
        <span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">multihead_attn</span> <span class="o">=</span> <span class="n">MaskedMultiHeadAttention</span><span class="p">(</span>
            <span class="n">d_model</span><span class="p">,</span>
            <span class="n">nhead</span><span class="p">,</span>
            <span class="n">dropout</span><span class="o">=</span><span class="n">dropout</span><span class="p">,</span>
            <span class="n">batch_first</span><span class="o">=</span><span class="n">batch_first</span><span class="p">,</span>
            <span class="o">**</span><span class="n">factory_kwargs</span>
        <span class="p">)</span>

        <span class="c1"># Implementation of Feedforward model</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">linear1</span> <span class="o">=</span> <span class="n">MaskedDense</span><span class="p">(</span><span class="n">d_model</span><span class="p">,</span> <span class="n">dim_feedforward</span><span class="p">,</span> <span class="o">**</span><span class="n">factory_kwargs</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">dropout</span> <span class="o">=</span> <span class="n">Dropout</span><span class="p">(</span><span class="n">dropout</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">linear2</span> <span class="o">=</span> <span class="n">MaskedDense</span><span class="p">(</span><span class="n">dim_feedforward</span><span class="p">,</span> <span class="n">d_model</span><span class="p">,</span> <span class="o">**</span><span class="n">factory_kwargs</span><span class="p">)</span>

        <span class="bp">self</span><span class="o">.</span><span class="n">norm_first</span> <span class="o">=</span> <span class="n">norm_first</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">norm1</span> <span class="o">=</span> <span class="n">LayerNorm</span><span class="p">(</span><span class="n">d_model</span><span class="p">,</span> <span class="n">eps</span><span class="o">=</span><span class="n">layer_norm_eps</span><span class="p">,</span> <span class="o">**</span><span class="n">factory_kwargs</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">norm2</span> <span class="o">=</span> <span class="n">LayerNorm</span><span class="p">(</span><span class="n">d_model</span><span class="p">,</span> <span class="n">eps</span><span class="o">=</span><span class="n">layer_norm_eps</span><span class="p">,</span> <span class="o">**</span><span class="n">factory_kwargs</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">norm3</span> <span class="o">=</span> <span class="n">LayerNorm</span><span class="p">(</span><span class="n">d_model</span><span class="p">,</span> <span class="n">eps</span><span class="o">=</span><span class="n">layer_norm_eps</span><span class="p">,</span> <span class="o">**</span><span class="n">factory_kwargs</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">dropout1</span> <span class="o">=</span> <span class="n">Dropout</span><span class="p">(</span><span class="n">dropout</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">dropout2</span> <span class="o">=</span> <span class="n">Dropout</span><span class="p">(</span><span class="n">dropout</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">dropout3</span> <span class="o">=</span> <span class="n">Dropout</span><span class="p">(</span><span class="n">dropout</span><span class="p">)</span>

    <span class="k">def</span> <span class="nf">__setstate__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span>
        <span class="k">if</span> <span class="s1">&#39;activation&#39;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">state</span><span class="p">:</span>
            <span class="n">state</span><span class="p">[</span><span class="s1">&#39;activation&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">relu</span>
        <span class="nb">super</span><span class="p">(</span><span class="n">MaskedTransformerDecoderLayer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__setstate__</span><span class="p">(</span><span class="n">state</span><span class="p">)</span>

<div class="viewcode-block" id="MaskedTransformerDecoderLayer.forward"><a class="viewcode-back" href="../../../../beyondml.pt.layers.html#beyondml.pt.layers.MaskedTransformerDecoderLayer.MaskedTransformerDecoderLayer.forward">[docs]</a>    <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tgt</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span> <span class="n">memory</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">):</span>
<span class="w">        </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Pass the inputs (and mask) through the decoder layer.</span>
<span class="sd">        Args:</span>
<span class="sd">            tgt: the sequence to the decoder layer.</span>
<span class="sd">            memory: the sequence from the last layer of the encoder.</span>
<span class="sd">        Shape:</span>
<span class="sd">            see the docs in Pytorch Transformer class.</span>
<span class="sd">        &quot;&quot;&quot;</span>

        <span class="n">x</span> <span class="o">=</span> <span class="n">tgt</span>
        <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sa_block</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">memory</span><span class="p">)</span>
        <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_mha_block</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">memory</span><span class="p">)</span>
        <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_ff_block</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
        <span class="k">return</span> <span class="n">x</span></div>

    <span class="c1"># self-attention block</span>

    <span class="k">def</span> <span class="nf">_sa_block</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span>
                  <span class="n">attn_mask</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">key_padding_mask</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
        <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">self_attn</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span>
                           <span class="n">attn_mask</span><span class="o">=</span><span class="n">attn_mask</span><span class="p">,</span>
                           <span class="n">key_padding_mask</span><span class="o">=</span><span class="n">key_padding_mask</span><span class="p">,</span>
                           <span class="n">need_weights</span><span class="o">=</span><span class="kc">False</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dropout1</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>

    <span class="c1"># multihead attention block</span>
    <span class="k">def</span> <span class="nf">_mha_block</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span> <span class="n">mem</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span>
                   <span class="n">attn_mask</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">key_padding_mask</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
        <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">multihead_attn</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">mem</span><span class="p">,</span> <span class="n">mem</span><span class="p">,</span>
                                <span class="n">attn_mask</span><span class="o">=</span><span class="n">attn_mask</span><span class="p">,</span>
                                <span class="n">key_padding_mask</span><span class="o">=</span><span class="n">key_padding_mask</span><span class="p">,</span>
                                <span class="n">need_weights</span><span class="o">=</span><span class="kc">False</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dropout2</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>

    <span class="c1"># feed forward block</span>
    <span class="k">def</span> <span class="nf">_ff_block</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
        <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">linear2</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dropout</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">activation</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">linear1</span><span class="p">(</span><span class="n">x</span><span class="p">))))</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dropout3</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>

    <span class="k">def</span> <span class="nf">_get_activation_fn</span><span class="p">(</span><span class="n">activation</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Tensor</span><span class="p">],</span> <span class="n">Tensor</span><span class="p">]:</span>
        <span class="k">if</span> <span class="n">activation</span> <span class="o">==</span> <span class="s2">&quot;relu&quot;</span><span class="p">:</span>
            <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">relu</span>
        <span class="k">elif</span> <span class="n">activation</span> <span class="o">==</span> <span class="s2">&quot;gelu&quot;</span><span class="p">:</span>
            <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">gelu</span>

        <span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
            <span class="s2">&quot;activation should be relu/gelu, not </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">activation</span><span class="p">))</span>

<div class="viewcode-block" id="MaskedTransformerDecoderLayer.prune"><a class="viewcode-back" href="../../../../beyondml.pt.layers.html#beyondml.pt.layers.MaskedTransformerDecoderLayer.MaskedTransformerDecoderLayer.prune">[docs]</a>    <span class="k">def</span> <span class="nf">prune</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">percentile</span><span class="p">):</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">self_attn</span><span class="o">.</span><span class="n">prune</span><span class="p">(</span><span class="n">percentile</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">multihead_attn</span><span class="o">.</span><span class="n">prune</span><span class="p">(</span><span class="n">percentile</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">linear1</span><span class="o">.</span><span class="n">prune</span><span class="p">(</span><span class="n">percentile</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">linear2</span><span class="o">.</span><span class="n">prune</span><span class="p">(</span><span class="n">percentile</span><span class="p">)</span></div></div>
</pre></div>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2023, BeyondML Labs.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>